# Kernel: hgemm_common_32x128

# ******************************************************************************
# Copyright 2014-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************

--:-:1:-:1      LDS.U.128 j0Ay0, [readAs + 4x<0*32  + 00 + 0*8>];
--:-:1:-:1      LDS.U.128 j0Bx0, [readBs + 4x<0*128 + 00 + 0*8>];
--:-:1:-:1      LDS.U.128 j0Ay4, [readAs + 4x<0*32  + 16 + 0*8>];
--:-:2:-:1      LDS.U.128 j1Ay0, [readAs + 4x<1*32  + 00 + 0*8>];
--:-:2:-:1      LDS.U.128 j1Bx0, [readBs + 4x<1*128 + 00 + 0*8>];
--:-:2:-:1      LDS.U.128 j1Ay4, [readAs + 4x<1*32  + 16 + 0*8>];

LOOP:

<CODE>

    our @top;
    our %insert;
    our $shiftAX;
    our $shiftBX;
    
    my @cOrder;
    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
    my @y = (0,1,4,5);
    foreach my $x (0,2)
    {
        foreach my $y (@y)
        {
            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
        }
        @y = reverse @y;
    }

    my $out = join '', @top;
    
    foreach my $j (0 .. 15)
    {
        my $barrier   = $j & 1 ? 2 : 1;
        my $rsPred    = $j >= 14 ? '@P0' : '   ';
        my $loadReg   = ($j + 2) & 3;
        my $shareLine = ($j + 2) & 15;
        my $shiftA    = $shiftAX ? $shareLine >> 2 : 0;
        my $shiftB    = $shiftBX ? $shareLine >> 2 : 0;
        my $compute   = $j & 3;


        $insert{"j${j}c0"} = sprintf "--:-:%d:-:1  %s LDS.U.128 j%dAy0, [readAs + 4x<%d*32  + 00 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftA;
        $insert{"j${j}c2"} = sprintf "--:-:%d:-:1  %s LDS.U.128 j%dBx0, [readBs + 4x<%d*128 + 00 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftB;
        $insert{"j${j}c4"} = sprintf "--:-:%d:-:1  %s LDS.U.128 j%dAy4, [readAs + 4x<%d*32  + 16 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftA;

        foreach my $c (0 .. 31)
        {
            my ($x,$y) = @{$cOrder[$c]};

            my $ins    = $insert{"j${j}c$c"} || '';

            my $wait   = $c == 0 ? "0$barrier" : '--';

            my $stall  = (split "\n", $ins)[0] =~ /LDS|F2F|I2I|LDG|STS|BAR|BRA/ ? 0 : 1;

            my $yield  = $c == 16 && $stall ? 'Y' : '-';

            my $ctrl   = "$wait:-:-:$yield:$stall";

            $out .= sprintf "%s      FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $compute,$x,  $compute,$y,  $x,$y,  $ins;
        }
    }
    return $out;

</CODE>

<SCHEDULE_BLOCK>

--:-:-:-:1      ISETP.GT.AND P0, PT, swapBuf, RZ, PT;
--:-:-:-:1      IADD readBs,  readBs, -4x<szShareA>;
--:-:-:-:1  @P0 IADD readAs,  readAs, -swapBuf;
--:-:-:-:1  @P0 IADD readBs,  readBs, -swapBuf;

--:-:-:-:1      MOV alpha, param_alpha;
--:-:-:-:1      MOV beta,  param_beta;
--:-:-:-:1      MOV flags, param_flags;

// writeCs = (readAs / 4) * 128 + readBs;
--:-:-:-:1      ISCADD  writeCs, readAs, readBs, 5;

// readCs = tid * 4;
--:-:-:-:1      SHL readCs, tid, 2;

// cx = blkB*128 + tid;
--:-:-:-:1      ISCADD cx, blkB, tid, 7;

// cy = blkA*32
--:-:-:-:1      SHL cy00, blkA, 5;

// C += (cy*ldc + cx) * 4;
--:-:-:-:1      MOV  ldc,  param_ldc;
--:-:-:-:1      MOV  ldcz, param_ldcz;

--:-:-:-:1      XMAD.LO  ci, cy00, ldc, cx, xmad_c;
--:-:-:-:1      XMAD.LO2 ci, ldcz, blkZ, ci;
--:-:-:-:1      LEA      C00y0.CC, ci, param_C[0],     2;
--:-:-:-:1      LEA.HI.X C00y1,    ci, param_C[1], RZ, 2;

// cx < n
--:-:-:-:1      ISETP.LT.AND P6, PT, cx, param_n, PT;

// beta != 0
--:-:-:-:1      ISETP.NE.AND P5, PT, beta, RZ, P6;

// Apply relu
--:-:-:-:1      LOP.AND.NZ P4, RZ, flags, 2;

--:-:-:-:1      SHL  ldc1, ldc, 2;
--:-:-:-:1      SHL  ldc4, ldc, 4;
--:-:-:-:1      ISCADD ldc12, ldc, -ldc4, 6;

</SCHEDULE_BLOCK>

--:-:-:-:5      IADD   C04y0.CC, C00y0, ldc4;
--:-:-:-:1      IADD   cy04, cy00,  4;
--:-:-:-:1      IADD.X C04y1,    C00y1, RZ;
--:-:-:-:5      IADD   C08y0.CC, C04y0, ldc4;
--:-:-:-:1      IADD   cy08, cy00,  8;
--:-:-:-:1      IADD.X C08y1,    C04y1, RZ;
--:-:-:-:5      IADD   C12y0.CC, C08y0, ldc4;
--:-:-:-:1      IADD   cy12, cy00,  12;
--:-:-:-:0      IADD.X C12y1,    C08y1, RZ;

--:-:-:-:5      BAR.SYNC 0;

<CODE>

    my $out;
    foreach my $y (0..7)
    {
        $out .=
            "--:-:-:-:5      IADD   C00y0.CC, C00y0, ldc12;\n" .
            "--:-:-:-:1      IADD   cy00,     cy00,  12;\n" .
            "--:-:-:-:1      IADD.X C00y1,    C00y1, RZ;\n" .
            "--:-:-:-:5      IADD   C04y0.CC, C04y0, ldc12;\n" .
            "--:-:-:-:1      IADD   cy04,     cy04,  12;\n" .
            "--:-:-:-:1      IADD.X C04y1,    C04y1, RZ;\n" .
            "--:-:-:-:5      IADD   C08y0.CC, C08y0, ldc12;\n" .
            "--:-:-:-:1      IADD   cy08,     cy08,  12;\n" .
            "--:-:-:-:1      IADD.X C08y1,    C08y1, RZ;\n" .
            "--:-:-:-:5      IADD   C12y0.CC, C12y0, ldc12;\n" .
            "--:-:-:-:1      IADD   cy12,     cy12,  12;\n" .
            "--:-:-:-:1      IADD.X C12y1,    C12y1, RZ;\n\n"  if $y == 4;

        $out .= sprintf(
            "--:-:-:-:1      FMUL c0, cx0y%d, alpha;\n" .
            "--:-:-:-:1      FMUL c1, cx1y%d, alpha;\n" .
            "--:-:-:-:1      FMUL c2, cx2y%d, alpha;\n" .
            "--:-:-:-:0      FMUL c3, cx3y%d, alpha;\n",
            ($y) x 4);

        $out .= "--:-:-:-:5      CAL STORE_C;\n\n";
    }
    return $out;

</CODE>

--:-:-:-:5      EXIT;

STORE_C:

<SCHEDULE_BLOCK>
--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, param_m, P5;
--:-:-:-:1      ISETP.LT.AND P1, PT, cy04, param_m, P5;
--:-:-:-:1      ISETP.LT.AND P2, PT, cy08, param_m, P5;
--:-:-:-:1      ISETP.LT.AND P3, PT, cy12, param_m, P5;

--:-:1:-:1  @P0 LDG.E d0, [C00y];
--:-:2:-:1  @P1 LDG.E d1, [C04y];
--:-:3:-:1  @P2 LDG.E d2, [C08y];
--:-:4:-:1  @P3 LDG.E d3, [C12y];
--:-:-:-:1 @!P0 MOV d0, RZ;
--:-:-:-:1 @!P1 MOV d1, RZ;
--:-:-:-:1 @!P2 MOV d2, RZ;
--:-:-:-:1 @!P3 MOV d3, RZ;

--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, param_m, P6;
--:-:-:-:1      ISETP.LT.AND P1, PT, cy04, param_m, P6;
--:-:-:-:1      ISETP.LT.AND P2, PT, cy08, param_m, P6;
--:-:-:-:1      ISETP.LT.AND P3, PT, cy12, param_m, P6;

--:-:-:-:1      IADD cy00, cy00, 1;
--:-:-:-:1      IADD cy04, cy04, 1;
--:-:-:-:1      IADD cy08, cy08, 1;
--:-:-:-:3      IADD cy12, cy12, 1;

--:-:-:-:1  @P4 FMNMX c0, c0, RZ, !PT;
--:-:-:-:1  @P4 FMNMX c1, c1, RZ, !PT;
--:-:-:-:1  @P4 FMNMX c2, c2, RZ, !PT;
--:-:-:-:1  @P4 FMNMX c3, c3, RZ, !PT;

--:-:-:-:1      STS.128 [writeCs], c0;
--:-:-:-:1      LDS c0, [readCs + 4x<0*128>];
--:-:5:-:1      LDS c1, [readCs + 4x<1*128>];
--:-:-:-:1      LDS c2, [readCs + 4x<2*128>];
--:-:6:-:1      LDS c3, [readCs + 4x<3*128>];
</SCHEDULE_BLOCK>

11:-:-:-:1  @P5 FFMA c0, d0, beta, c0;
02:-:-:-:1  @P5 FFMA c1, d1, beta, c1;
24:-:-:-:1  @P5 FFMA c2, d2, beta, c2;
08:-:-:-:0  @P5 FFMA c3, d3, beta, c3;

--:1:-:-:1  @P0 STG.E.CG [C00y], c0;
--:2:-:-:1  @P1 STG.E.CG [C04y], c1;
--:3:-:-:1  @P2 STG.E.CG [C08y], c2;
--:4:-:-:1  @P3 STG.E.CG [C12y], c3;

01:-:-:-:6      IADD   C00y0.CC, C00y0, ldc1;
--:-:-:-:1      IADD.X C00y1,    C00y1, RZ;
02:-:-:-:6      IADD   C04y0.CC, C04y0, ldc1;
--:-:-:-:1      IADD.X C04y1,    C04y1, RZ;
04:-:-:-:6      IADD   C08y0.CC, C08y0, ldc1;
--:-:-:-:1      IADD.X C08y1,    C08y1, RZ;
08:-:-:-:6      IADD   C12y0.CC, C12y0, ldc1;
--:-:-:-:0      IADD.X C12y1,    C12y1, RZ;

--:-:-:-:5      RET;
